import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from escoring.enrichment_scoring import calculate_escores
from escoring.enrichment_scoring import permute, sig_interval
from escoring.support_funcs import load_sparse_h5, pairwise_similarities
from escoring.support_funcs import sig_dictionary
# This is the integrated (batch-corrected) expression matrix from atlas
HVG = load_sparse_h5("counts","./supp_data/hvg_integrated.h5")
type(HVG)
# Escoring accepts matrices where rows are cells and columns are genes. So one will need to transpose the matrix if the matrix is gene-by-cell
HVG = HVG.T
HVG.shape
fname = "./supp_data/hvg_ids.txt"
with open(fname, "r") as f:
hvg_names = [gene.strip("\n") for gene in f.readlines()]
f.close()
# Ramdomly select cells in region that is not confidently annotated by correlation-based and ici method
fname = "./supp_data/selected.cells.txt"
with open(fname, "r") as f:
r_cells = [int(cell.strip("\n")) for cell in f.readlines()]
f.close()
UMAP_50 = np.loadtxt("./supp_data/umap50.txt") # or load your preferred representation
UMAP = np.loadtxt("./supp_data/umap.txt")
# RBF kernel is chosen
metric = "rbf"
gamma = 0.8 # only use if laplacian, sigmoid or rbf and replace by wished value
S = pairwise_similarities(UMAP_50, r_cells, metric=metric,
metric_params={"gamma": gamma} # only use if needed
)
S.shape
# Calculate the enrichment scores
escores = calculate_escores(HVG, r_cells, S=S, optim_over="cols", scale_exp=False)
The escores dataframe is a dataframe of size genes x r_cells. The order of genes is preserved, so you can map them back to the indices of the genes in the original data. Take care here that in Python, counting starts at 0 and not 1. If you need any help here, let me know. Below, I manually set the gene names to the index.
escores.index = hvg_names
escores.index
# Permute the dataframe. This takes a little while.
n = 100 # how many times to permute the dataframe
seed = 42 # set this for reproducibility
P = permute(HVG, n=n, seed=seed)
pscores = calculate_escores(P, r_cells, S=S, optim_over="cols", scale_exp=False)
# Determine the significance cut-offs
n_sds = 5 # the number of SDs away from the mean for significance
cutoffs = sig_interval(pscores, n_sds=n_sds)
type(escores)
# Get a dictionary of significant genes per cell
sigs = sig_dictionary(escores, cutoffs)
sigs
df = pd.DataFrame({key: pd.Series(value) for key, value in sigs.items()})
df.to_csv("./supp_data/escoring.gene.nsds5.csv", encoding='utf-8', index=False)
escores.to_csv("./supp_data/escoring.gene.csv", encoding='utf-8', index=False)
for cell in r_cells:
top = escores.loc[:, cell].sort_values(ascending=False).index[0:3]
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
fig.subplots_adjust(wspace=0.3, hspace=0.5)
fig.set_figheight(15)
fig.set_figwidth(20)
ax1.scatter(UMAP[:, 0], UMAP[:, 1], s=2, color="lightgrey")
ax1.scatter(UMAP[int(cell), 0], UMAP[int(cell), 1],
s=2, color="orange")
ax1.set_title(cell)
for t, ax in zip(top, [ax2, ax3, ax4]):
ti = list(hvg_names).index(t)
ax.scatter(UMAP[:, 0], UMAP[:, 1], s=2,
c=HVG[:, ti].data, cmap="Oranges")
ax.set_title(t)
plt.show()
print("\n")